{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9a6d23f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ch02-2 - Pitfalls of joining data with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4d64b416",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Libraries\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "923bc568",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_43783/2610587055.py:2: DtypeWarning: Columns (7,12,15,23) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\")\n"
     ]
    }
   ],
   "source": [
    "# Jumble the data using random sampling\n",
    "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") \n",
    "vdata.sample(frac=0.9).to_csv(\"vdata_sample.csv.gz\", index=False) \n",
    "vax = pd.read_csv(\"data/2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\") \n",
    "vax.sample(frac=0.9).to_csv(\"vax_sample.csv.gz\", index=False) \n",
    "# Note - it is ok to get a dtype warning here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d0ef3b0d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(677736, 717399, 645567)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inner join on the tables\n",
    "vdata = pd.read_csv(\"vdata_sample.csv.gz\", low_memory=False) \n",
    "vax = pd.read_csv(\"vax_sample.csv.gz\", low_memory=False) \n",
    "vdata_with_vax = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"inner\") \n",
    "len(vdata), len(vax), len(vdata_with_vax) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9ed5a95b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>VAERS_ID</th>\n",
       "      <th>VAX_TYPE</th>\n",
       "      <th>VAX_MANU</th>\n",
       "      <th>VAX_LOT</th>\n",
       "      <th>VAX_DOSE_SERIES</th>\n",
       "      <th>VAX_ROUTE</th>\n",
       "      <th>VAX_SITE</th>\n",
       "      <th>VAX_NAME</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1186337</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>MODERNA</td>\n",
       "      <td>026b21a</td>\n",
       "      <td>1</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (MODERNA))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>943580</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>MODERNA</td>\n",
       "      <td>039120A</td>\n",
       "      <td>1</td>\n",
       "      <td>IM</td>\n",
       "      <td>RA</td>\n",
       "      <td>COVID19 (COVID19 (MODERNA))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1176243</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>MODERNA</td>\n",
       "      <td>019B21A</td>\n",
       "      <td>2</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (MODERNA))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1255767</td>\n",
       "      <td>DTAPIPVHIB</td>\n",
       "      <td>SANOFI PASTEUR</td>\n",
       "      <td>UJ162AB</td>\n",
       "      <td>3</td>\n",
       "      <td>OT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DTAP + IPV + HIB (PENTACEL)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1269613</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>MODERNA</td>\n",
       "      <td>44A21A</td>\n",
       "      <td>1</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (MODERNA))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>717394</th>\n",
       "      <td>991070</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>PFIZER\\BIONTECH</td>\n",
       "      <td>EL9262</td>\n",
       "      <td>1</td>\n",
       "      <td>SYR</td>\n",
       "      <td>RA</td>\n",
       "      <td>COVID19 (COVID19 (PFIZER-BIONTECH))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>717395</th>\n",
       "      <td>1727097</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>MODERNA</td>\n",
       "      <td>038C21A</td>\n",
       "      <td>UNK</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (MODERNA))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>717396</th>\n",
       "      <td>1222179</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>PFIZER\\BIONTECH</td>\n",
       "      <td>en6200</td>\n",
       "      <td>1</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (PFIZER-BIONTECH))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>717397</th>\n",
       "      <td>952505</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>PFIZER\\BIONTECH</td>\n",
       "      <td>EL3247</td>\n",
       "      <td>1</td>\n",
       "      <td>IM</td>\n",
       "      <td>LA</td>\n",
       "      <td>COVID19 (COVID19 (PFIZER-BIONTECH))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>717398</th>\n",
       "      <td>1259584</td>\n",
       "      <td>COVID19</td>\n",
       "      <td>PFIZER\\BIONTECH</td>\n",
       "      <td>EW0169</td>\n",
       "      <td>1</td>\n",
       "      <td>SYR</td>\n",
       "      <td>RA</td>\n",
       "      <td>COVID19 (COVID19 (PFIZER-BIONTECH))</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>717399 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        VAERS_ID    VAX_TYPE         VAX_MANU  VAX_LOT VAX_DOSE_SERIES   \n",
       "0        1186337     COVID19          MODERNA  026b21a               1  \\\n",
       "1         943580     COVID19          MODERNA  039120A               1   \n",
       "2        1176243     COVID19          MODERNA  019B21A               2   \n",
       "3        1255767  DTAPIPVHIB   SANOFI PASTEUR  UJ162AB               3   \n",
       "4        1269613     COVID19          MODERNA   44A21A               1   \n",
       "...          ...         ...              ...      ...             ...   \n",
       "717394    991070     COVID19  PFIZER\\BIONTECH   EL9262               1   \n",
       "717395   1727097     COVID19          MODERNA  038C21A             UNK   \n",
       "717396   1222179     COVID19  PFIZER\\BIONTECH   en6200               1   \n",
       "717397    952505     COVID19  PFIZER\\BIONTECH   EL3247               1   \n",
       "717398   1259584     COVID19  PFIZER\\BIONTECH   EW0169               1   \n",
       "\n",
       "       VAX_ROUTE VAX_SITE                             VAX_NAME  \n",
       "0             IM       LA          COVID19 (COVID19 (MODERNA))  \n",
       "1             IM       RA          COVID19 (COVID19 (MODERNA))  \n",
       "2             IM       LA          COVID19 (COVID19 (MODERNA))  \n",
       "3             OT      NaN          DTAP + IPV + HIB (PENTACEL)  \n",
       "4             IM       LA          COVID19 (COVID19 (MODERNA))  \n",
       "...          ...      ...                                  ...  \n",
       "717394       SYR       RA  COVID19 (COVID19 (PFIZER-BIONTECH))  \n",
       "717395        IM       LA          COVID19 (COVID19 (MODERNA))  \n",
       "717396        IM       LA  COVID19 (COVID19 (PFIZER-BIONTECH))  \n",
       "717397        IM       LA  COVID19 (COVID19 (PFIZER-BIONTECH))  \n",
       "717398       SYR       RA  COVID19 (COVID19 (PFIZER-BIONTECH))  \n",
       "\n",
       "[717399 rows x 8 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Find the data not captured by the join\n",
    "lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)] \n",
    "lost_vdata \n",
    "lost_vax = vax[~vax[\"VAERS_ID\"].isin(vdata.index)] \n",
    "lost_vax "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "13329af6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VAERS_ID\n",
       "910642      1\n",
       "1595074     1\n",
       "1595075     1\n",
       "1595076     1\n",
       "1595079     1\n",
       "           ..\n",
       "1931705     9\n",
       "1714163     9\n",
       "1147077    10\n",
       "1147080    11\n",
       "962303     12\n",
       "Length: 677736, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Left outer join\n",
    "vdata_with_vax_left = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\") \n",
    "vdata_with_vax_left.groupby(\"VAERS_ID\").size().sort_values() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "24d2f205",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1251"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Right join\n",
    "dead = vdata[vdata.DIED == \"Y\"] \n",
    "vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n",
    "vax19_dead = vax19.join(dead.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"right\") \n",
    "len(vax19), len(dead), len(vax19_dead) \n",
    "len(vax19_dead[vax19_dead.VAERS_ID.duplicated()]) \n",
    "len(vax19_dead) - len(dead) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b877c4b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Problematic lots\n",
    "vax19_dead[\"STATE\"] = vax19_dead[\"STATE\"].str.upper() \n",
    "dead_lot = vax19_dead[[\"VAERS_ID\", \"VAX_LOT\", \"STATE\"]].set_index([\"VAERS_ID\", \"VAX_LOT\"]) \n",
    "dead_lot_clean = dead_lot[~dead_lot.index.duplicated()] \n",
    "dead_lot_clean = dead_lot_clean.reset_index() \n",
    "dead_lot_clean[dead_lot_clean.VAERS_ID.isna()] \n",
    "baddies = dead_lot_clean.groupby(\"VAX_LOT\").size().sort_values(ascending=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6b3cc8a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unknown 176 32\n",
      "EN6201 129 29\n",
      "unknown 99 20\n",
      "EN6200 99 19\n",
      "EN5318 95 23\n",
      "EN6198 85 18\n",
      "EL9261 85 18\n",
      "EN6202 80 19\n",
      "EL9269 80 19\n",
      "EL3248 79 14\n",
      "EM9810 78 19\n"
     ]
    }
   ],
   "source": [
    "# Print problematic lots\n",
    "for i, (lot, cnt) in enumerate(baddies.items()):\n",
    "    print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby(\"STATE\")))\n",
    "    if i == 10:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d249a4e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# End of Notebook #"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
